- 3.0 Introduction
- 3.1 Intervals & Tests
- 3.2 Residuals & Assumptions
- 3.3 Let's practice more
2018-05-06
## Simulate the Alien dataset and fit one GLM per family used in this chapter.
set.seed(1L)
Aliens <- simulate_Aliens_GLM()
mod_gauss <- glm(size ~ humans_eaten, family = gaussian(), data = Aliens)
mod_poiss <- glm(eggs ~ humans_eaten, family = poisson(), data = Aliens)
mod_binar <- glm(happy ~ humans_eaten, family = binomial(), data = Aliens)
## 2-column (successes, failures) binomial response:
mod_binom <- glm(cbind(blue_eyes, pink_eyes) ~ humans_eaten,
                 family = binomial(), data = Aliens)
## plot(model)? Default base-R diagnostic plots, one 2x2 panel per model.
par(mfrow = c(2, 2))
plot(mod_poiss)
par(mfrow = c(2, 2))
plot(mod_binar)
par(mfrow = c(2, 2))
plot(mod_binom)
## The four residual types of a Poisson GLM, first two observations.
rbind( residuals(mod_poiss, type = "deviance")[1:2], residuals(mod_poiss, type = "pearson")[1:2], residuals(mod_poiss, type = "working")[1:2], residuals(mod_poiss, type = "response")[1:2])
## 1 2 ## [1,] 0.2639365 -1.2774761 ## [2,] 0.2769535 -0.9033120 ## [3,] 0.3179479 -1.0000000 ## [4,] 0.2412447 -0.8159726
## Same comparison for the binomial (2-column response) GLM.
rbind( residuals(mod_binom, type = "deviance")[1:2], residuals(mod_binom, type = "pearson")[1:2], residuals(mod_binom, type = "working")[1:2], residuals(mod_binom, type = "response")[1:2])
## 1 2 ## [1,] 1.4087111 -0.8875765 ## [2,] 1.3994400 -0.9179635 ## [3,] 1.0629762 -0.6653824 ## [4,] 0.2632007 -0.1407139
Note 1: we can also compute partial residuals, which we shall see later.
Note 2: this is also true for LM and gaussian(link = "identity") GLM, but it makes no difference in such case (except for partial residuals).
These are the residuals that we saw in LM: observed - predicted. Simple but pretty much useless.
## Response residuals = observed - fitted, on the response scale (gaussian).
rbind(residuals(mod_gauss, type = "response")[1:2],
(mod_gauss$y - mod_gauss$fitted.values)[1:2])
## 1 2 ## [1,] 2.434285 -2.804164 ## [2,] 2.434285 -2.804164
## Same identity for the Poisson model.
rbind(residuals(mod_poiss, type = "response")[1:2],
(mod_poiss$y - mod_poiss$fitted.values)[1:2])
## 1 2 ## [1,] 0.2412447 -0.8159726 ## [2,] 0.2412447 -0.8159726
## Same identity for the binomial model ($y is here the proportion of successes).
rbind(residuals(mod_binom, type = "response")[1:2],
(mod_binom$y - mod_binom$fitted.values)[1:2])
## 1 2 ## [1,] 0.2632007 -0.1407139 ## [2,] 0.2632007 -0.1407139
These are response residuals scaled by the square root of the variance function and accounting for their prior weights.
## Pearson residuals = response residuals scaled by sqrt(weights / variance
## function evaluated at the fitted values).
variances <- poisson()$variance(mod_poiss$fitted.values)
rbind(residuals(mod_poiss, type = "pearson")[1:2],
(residuals(mod_poiss, type = "response") * sqrt(mod_poiss$prior.weights / variances))[1:2])
## 1 2 ## [1,] 0.2769535 -0.903312 ## [2,] 0.2769535 -0.903312
## Same identity for the binomial model (prior.weights = number of trials).
variances <- binomial()$variance(mod_binom$fitted.values)
rbind(residuals(mod_binom, type = "pearson")[1:2],
(residuals(mod_binom, type = "response") * sqrt(mod_binom$prior.weights / variances))[1:2])
## 1 2 ## [1,] 1.39944 -0.9179635 ## [2,] 1.39944 -0.9179635
These residuals are often used to check the quality of the fit.
The sum of the squared Pearson residuals gives the Pearson \(\chi^2\) statistic of goodness of fit:
## Pearson chi^2 statistic = sum of squared Pearson residuals (here for mod_binom).
(X <- sum(residuals(mod_binom, type = "pearson")^2))
## [1] 88.77233
This statistic is sometimes used to test the goodness of fit of the model (= assess the ability of a model to fit the data with no alternative in mind) or to test for overdispersion (= is the residual variance compatible with the variance function), but I would not recommend you to use such test because its usage is often not optimal (it requires a lot of replications within each grouping, the power is poor, and the coverage is often poor).
## FIX: X was computed from mod_binom (above), but the original used
## mod_poiss$df.residual. The two models happen to share the same residual df
## (same data, same number of coefficients), so the printed values are
## unchanged — but mixing models is error-prone; use the matching model.
pchisq(X, mod_binom$df.residual, lower.tail = FALSE) ## Pearson goodness of fit test
## [1] 0.7366322
X / mod_binom$df.residual ## measure of overdispersion (~1 expected if none)
## [1] 0.9058401
These are the residuals that are minimized after the fit.
Contrary to those computed by family()$dev.resids they are here not squared.
## Deviance residuals: signed square roots of each observation's contribution
## to the deviance (these are what the fit minimizes).
residuals(mod_poiss, type = "deviance")[1:2]
## 1 2 ## 0.2639365 -1.2774761
## family()$dev.resids returns the SQUARED version of the same quantities:
rbind((residuals(mod_poiss, type = "deviance")[1:2])^2,
with(mod_poiss, poisson()$dev.resids(y = y, mu = fitted.values, wt = prior.weights))[1:2])
The sum of the squared deviance residuals gives the residual (unscaled) deviance of the model fit:
## Sum of squared deviance residuals = residual (unscaled) deviance, for each family:
c(sum(residuals(mod_gauss, type = "deviance")^2), deviance(mod_gauss))
## [1] 2172.918 2172.918
c(sum(residuals(mod_poiss, type = "deviance")^2), deviance(mod_poiss))
## [1] 116.8588 116.8588
c(sum(residuals(mod_binom, type = "deviance")^2), deviance(mod_binom))
## [1] 92.83338 92.83338
These residuals are sometimes also used to check the quality of the fit, in the exact same fashion as the Pearson's residuals, but again (and for the same reasons), I would not recommend you to do that.
For mod_poiss
## Calibration check: under the true model, the p-values of the Pearson
## goodness-of-fit test should be uniformly distributed on [0, 1].
p <- replicate(1000, {
  sim_dat <- simulate_Aliens_GLM()
  refit <- update(mod_poiss, data = sim_dat)
  pearson_X2 <- sum(residuals(refit, type = "pearson")^2)
  pchisq(pearson_X2, refit$df.residual, lower.tail = FALSE)
})
plot(ecdf(p))
abline(0, 1, lty = 2, col = "green") ## reference: ecdf of a uniform
## Same calibration check, using the residual deviance as the test statistic.
p <- replicate(1000, {
  sim_dat <- simulate_Aliens_GLM()
  refit <- update(mod_poiss, data = sim_dat)
  pchisq(deviance(refit), refit$df.residual, lower.tail = FALSE)
})
plot(ecdf(p))
abline(0, 1, lty = 2, col = "green")
For mod_binar
## Pearson-based calibration check for the binary model: the chi^2 approximation
## is notoriously poor for binary data, so expect strong departure from uniformity.
p <- replicate(1000, {
  sim_dat <- simulate_Aliens_GLM()
  refit <- update(mod_binar, data = sim_dat)
  pearson_X2 <- sum(residuals(refit, type = "pearson")^2)
  pchisq(pearson_X2, refit$df.residual, lower.tail = FALSE)
})
plot(ecdf(p))
abline(0, 1, lty = 2, col = "green")
## Deviance-based calibration check for the binary model.
p <- replicate(1000, {
  sim_dat <- simulate_Aliens_GLM()
  refit <- update(mod_binar, data = sim_dat)
  pchisq(deviance(refit), refit$df.residual, lower.tail = FALSE)
})
plot(ecdf(p))
abline(0, 1, lty = 2, col = "green")
For mod_binom
## Pearson-based calibration check for the 2-column binomial model.
p <- replicate(1000, {
  sim_dat <- simulate_Aliens_GLM()
  refit <- update(mod_binom, data = sim_dat)
  pearson_X2 <- sum(residuals(refit, type = "pearson")^2)
  pchisq(pearson_X2, refit$df.residual, lower.tail = FALSE)
})
plot(ecdf(p))
abline(0, 1, lty = 2, col = "green")
## Deviance-based calibration check for the 2-column binomial model.
p <- replicate(1000, {
  sim_dat <- simulate_Aliens_GLM()
  refit <- update(mod_binom, data = sim_dat)
  pchisq(deviance(refit), refit$df.residual, lower.tail = FALSE)
})
plot(ecdf(p))
abline(0, 1, lty = 2, col = "green")
These are the residuals used during the last iteration of the iterative fitting procedure. Useless otherwise.
## Working residuals = response residuals divided by mu.eta (the derivative of
## the inverse link), as used in the last IWLS iteration. Stored in $residuals.
rbind(mod_poiss$residuals[1:2],
((mod_poiss$y - mod_poiss$fitted.values)/poisson()$mu.eta(mod_poiss$linear.predictors))[1:2])
## 1 2 ## [1,] 0.3179479 -1 ## [2,] 0.3179479 -1
## Same identity for the binomial model:
rbind(mod_binom$residuals[1:2],
((mod_binom$y - mod_binom$fitted.values)/binomial()$mu.eta(mod_binom$linear.predictors))[1:2])
## 1 2 ## [1,] 1.062976 -0.6653824 ## [2,] 1.062976 -0.6653824
These are residuals expressed at the level of each predictor.
## Small Poisson GLM on the first 10 rows of the UK dataset, then its partial
## residuals: one column per predictor (plus a "constant" attribute).
mod_UK_small <- glm(milk ~ sex + mother_weight + cigarettes,
                    data = UK[1:10, ], family = poisson())
residuals(mod_UK_small, type = "partial")
## sex mother_weight cigarettes ## 1 -2.668484 -2.0040869 -2.1344037 ## 2 1.430742 -1.6589504 -0.2767289 ## 3 2.270524 -0.9540451 0.5630537 ## 4 -2.668484 -0.3855587 -2.1344037 ## 6 2.815065 -0.4095045 0.7294598 ## 7 -1.311385 0.1622767 3.0040416 ## 8 -1.626415 4.2981986 -1.0923348 ## 9 -2.668484 -0.6553134 0.1344037 ## 10 2.281588 -0.5383491 -0.9384206 ## attr(,"constant") ## [1] -0.3958377
These are residuals expressed at the level of each predictor.
Computation:
## type = "terms" decomposes the linear predictor into one column per predictor.
(p <- predict(mod_UK_small, type = "terms")) ## prediction expressed per predictor
## sex mother_weight cigarettes ## 1 -1.668484 -1.0040869 -1.1344037 ## 2 2.085605 -1.0040869 0.3781346 ## 3 2.085605 -1.1389643 0.3781346 ## 4 -1.668484 0.6144413 -1.1344037 ## 6 2.085605 -1.1389643 0.0000000 ## 7 -1.668484 -0.1948228 2.6469421 ## 8 -1.668484 4.2561297 -1.1344037 ## 9 -1.668484 0.3446866 1.1344037 ## 10 2.085605 -0.7343322 -1.1344037 ## attr(,"constant") ## [1] -0.3958377
## Sanity check: row sum of the term contributions + constant = linear predictor.
c(sum(p[1, ]) + attr(p, "constant"), predict(mod_UK_small, type = "link")[1])
## 1 ## -4.202812 -4.202812
These are residuals expressed at the level of each predictor.
Computation:
## Partial residuals = per-term predictions + working residuals.
rbind((p + residuals(mod_UK_small, type = "working"))[1, ],
residuals(mod_UK_small, type = "partial")[1, ])
## sex mother_weight cigarettes ## [1,] -2.668484 -2.004087 -2.134404 ## [2,] -2.668484 -2.004087 -2.134404
Partial residuals can be used to check for departure from linearity (this concerns quantitative predictors).
## Component + residual (partial residual) plots, one per predictor:
library(car)
par(mfrow = c(1, 3))
crPlots(mod_UK_small, terms = ~ sex)
crPlots(mod_UK_small, terms = ~ mother_weight)
crPlots(mod_UK_small, terms = ~ cigarettes)
## Same plots on the full UK dataset:
mod_UK <- glm(milk ~ sex + mother_weight + cigarettes,
              data = UK, family = poisson())
par(mfrow = c(1, 3))
crPlots(mod_UK, terms = ~ sex)
crPlots(mod_UK, terms = ~ mother_weight)
crPlots(mod_UK, terms = ~ cigarettes)
## Simulated (randomized quantile) residuals by hand: simulate the response
## many times, jitter the discrete counts so the ecdf is continuous, and locate
## each observed value within the ecdf of its own simulations.
set.seed(1L)
s <- simulate(mod_poiss, 1000)
r <- sapply(s, function(i) i + runif(nrow(mod_poiss$model), min = -0.5, max = 0.5))
hist(r[1, ], main = "distrib of first fitted value", nclass = 30)
abline(v = mod_poiss$y[1] + runif(1, min = -0.5, max = 0.5),
       col = "red", lwd = 2, lty = 2)
plot(ecdf1 <- ecdf(r[1, ]), main = "cdf of first fitted value")
noise <- runif(1, min = -0.5, max = 0.5)
simulated_residual_1 <- ecdf1(mod_poiss$y[1] + noise)
segments(x0 = mod_poiss$y[1] + noise, y0 = 0, y1 = simulated_residual_1,
         col = "red", lwd = 2)
arrows(x0 = mod_poiss$y[1] + noise, x1 = -1, y0 = simulated_residual_1,
       col = "red", lwd = 2)
plot(ecdf2 <- ecdf(r[2, ]), main = "cdf of second fitted value")
noise <- runif(1, min = -0.5, max = 0.5)
simulated_residual_2 <- ecdf2(mod_poiss$y[2] + noise)
segments(x0 = mod_poiss$y[2] + noise, y0 = 0, y1 = simulated_residual_2,
         col = "red", lwd = 2)
arrows(x0 = mod_poiss$y[2] + noise, x1 = -1, y0 = simulated_residual_2,
       col = "red", lwd = 2)
## One simulated residual per observation: the position of the (jittered)
## observed count within the ecdf of its simulated values.
## Fixes: seq_len() instead of 1:nrow() (safe for zero rows), nrow() hoisted
## out of the loop, and numeric() preallocation instead of rep(NA, ...).
n_obs <- nrow(mod_poiss$model)
simulated_residuals <- numeric(n_obs)
for (i in seq_len(n_obs)) {
  ecdf_fn <- ecdf(r[i, ])
  simulated_residuals[i] <- ecdf_fn(mod_poiss$y[i] + runif(1, min = -0.5, max = 0.5))
}
plot(simulated_residuals)         ## should show no structure
plot(ecdf(simulated_residuals))   ## should look ~uniform
## DHARMa automates the simulated-residual workflow:
library(DHARMa)
mod_poiss_simres <- simulateResiduals(mod_poiss)
plot(mod_poiss_simres)
mod_binar_simres <- simulateResiduals(mod_binar)
plot(mod_binar_simres)
mod_binom_simres <- simulateResiduals(mod_binom)
plot(mod_binom_simres)
mod_UK_simres <- simulateResiduals(mod_UK)
plot(mod_UK_simres)
A variant based on the parametric bootstrap takes more time to run (and is more fragile to convergence issues), but is likely to be more accurate when it works.
In this case, the observed residuals are compared to those produced by models refitted on the simulated response.
## refit = TRUE: parametric bootstrap variant — observed residuals are compared
## to residuals of models refitted on each simulated response (slower, fragile).
mod_UK_simres2 <- simulateResiduals(mod_UK, refit = TRUE, n = 1000)
but we may try to
## Simulate a response that is quadratic on the link scale, then fit a
## misspecified (linear) model and the correct (quadratic) one.
set.seed(1L)
Aliens2 <- data.frame(humans_eaten = runif(100, min = 0, max = 15))
Aliens2$eggs <- rpois(
  n = 100,
  lambda = exp(1 + 0.2 * Aliens2$humans_eaten - 0.02 * Aliens2$humans_eaten^2))
mod_poiss2bad <- glm(eggs ~ humans_eaten, data = Aliens2,
                     family = poisson()) ## mispecified model
(mod_poiss2good <- glm(eggs ~ poly(humans_eaten, 2, raw = TRUE),
                       data = Aliens2, family = poisson())) ## good model
## ## Call: glm(formula = eggs ~ poly(humans_eaten, 2, raw = TRUE), family = poisson(), ## data = Aliens2) ## ## Coefficients: ## (Intercept) poly(humans_eaten, 2, raw = TRUE)1 poly(humans_eaten, 2, raw = TRUE)2 ## 1.18654 0.13015 -0.01463 ## ## Degrees of Freedom: 99 Total (i.e. Null); 97 Residual ## Null Deviance: 137.5 ## Residual Deviance: 95.12 AIC: 376.7
## True linear predictor (eta) vs predictions of both models, on the link scale:
plot(I(1 + 0.2 * Aliens2$humans_eaten - 0.02 * Aliens2$humans_eaten^2) ~ Aliens2$humans_eaten,
ylab = "eta", ylim = c(-1, 2))
data.for.pred <- data.frame(humans_eaten = 0:15)
points(predict(mod_poiss2good, newdata = data.for.pred) ~ I(0:15), col = "blue")
points(predict(mod_poiss2bad, newdata = data.for.pred) ~ I(0:15), col = "red")
## Same comparison on the response scale (expected number of eggs):
plot(exp(1 + 0.2 * Aliens2$humans_eaten - 0.02 * Aliens2$humans_eaten^2) ~ Aliens2$humans_eaten,
ylab = "predicted number of eggs", ylim = c(0, 6))
points(predict(mod_poiss2good, newdata = data.for.pred, type = "response") ~ I(0:15), col = "blue")
points(predict(mod_poiss2bad, newdata = data.for.pred, type = "response") ~ I(0:15), col = "red")
## Partial residual plots reveal the missed curvature in the bad model:
crPlots(mod_poiss2bad, terms = ~ humans_eaten)
crPlots(mod_poiss2good)
## DHARMa residuals (parametric bootstrap) for the misspecified and good models:
plot(s_bad <- simulateResiduals(mod_poiss2bad, refit = TRUE, n = 1000))
plot(s_good <- simulateResiduals(mod_poiss2good, refit = TRUE, n = 1000))
## Warning in simulateResiduals(mod_poiss2good, refit = TRUE, n = 1000): DHARMa::simulateResiduals warning: on refit = T, ## at least one of the refitted models produced an error. Inspect the refitted model values. Results may not be reliable.
## Warning in simulateResiduals(mod_poiss2good, refit = TRUE, n = 1000): There were 999 of 1000 duplicate parameter ## estimates in the refitted models. This may hint towards a problem with optimizer convergence in the fitted models. ## Results are likely not reliable. The suggested action is to not use the refitting procedure, and diagnose with tools ## available for the normal (not refitted) simulated residuals. If you absolutely require the refitting procedure, try ## changing tolerance / iterations in the optimizer settings.
## Error in ecdf(out$refittedResiduals[i, ] + runif(out$nSim, -0.5, 0.5)): 'x' must have 1 or more non-missing values
We try again!
## The refit procedure failed above, so fall back to plain simulated residuals:
plot(s_good <- simulateResiduals(mod_poiss2good, refit = FALSE))
## KS test of uniformity of the scaled residuals:
testUniformity(s_bad)
## ## One-sample Kolmogorov-Smirnov test ## ## data: simulationOutput$scaledResiduals ## D = 0.049, p-value = 0.97 ## alternative hypothesis: two-sided
## Same uniformity test for the correctly specified model:
testUniformity(s_good)
## ## One-sample Kolmogorov-Smirnov test ## ## data: simulationOutput$scaledResiduals ## D = 0.062, p-value = 0.8367 ## alternative hypothesis: two-sided
## A GAM with a smooth term captures the non-linearity directly:
library(mgcv)
mod_poiss2GAM <- gam(eggs ~ s(humans_eaten), data = Aliens2, family = poisson())
plot(mod_poiss2GAM, shade = TRUE, residuals = FALSE,
     shade.col = "red") ## on the scale of the linear predictor!
You can run the Durbin Watson test on the simulated residuals:
## Durbin-Watson test on simulated residuals, ordered by fitted values:
testTemporalAutocorrelation(s_good, time = mod_poiss2good$fitted.values, plot = FALSE)
## ## Durbin-Watson test ## ## data: simulationOutput$scaledResiduals ~ 1 ## DW = 2.2265, p-value = 0.8737 ## alternative hypothesis: true autocorrelation is greater than 0
You can run the Durbin Watson test on the simulated residuals:
## Same test, ordering along the predictor instead of the fitted values:
testTemporalAutocorrelation(s_good, time = Aliens2$humans_eaten, plot = FALSE)
## ## Durbin-Watson test ## ## data: simulationOutput$scaledResiduals ~ 1 ## DW = 2.2876, p-value = 0.9268 ## alternative hypothesis: true autocorrelation is greater than 0
Note: As for LM it is good practice to test for the lack of serial autocorrelation along all your predictors and not just along the fitted values.
Survival times series that are discrete and complete can be analysed using a binomial (binary!) GLM.
Example: the study of the influence of rainfall on mortality (here A dies at 5 yrs old and B at 3 yrs old).
## Discrete, complete survival data in long (person-year) format:
## A dies at age 5, B at age 3 (rows after death are NA).
data.frame(age = rep(1:5, times = 2),
           id = rep(c("A", "B"), each = 5),
           death = c(0, 0, 0, 0, 1, 0, 0, 1, NA, NA),
           annual_rain = c(100, 120, 310, 50, 200, 45, 100, 320, 100, 120))
## age id death annual_rain ## 1 1 A 0 100 ## 2 2 A 0 120 ## 3 3 A 0 310 ## 4 4 A 0 50 ## 5 5 A 1 200 ## 6 1 B 0 45 ## 7 2 B 0 100 ## 8 3 B 1 320 ## 9 4 B NA 100 ## 10 5 B NA 120
In such a case, a random effect for the identity of the individual is not needed! So there is no need for a mixed model if individuals are all equally independent from each other.
## Variance functions of three families, plotted against the mean:
p <- seq(0, 1, 0.1)
lambda <- 0:10
theta <- 0:10
v_b <- binomial()$variance(p)      # mu * (1 - mu)
v_p <- poisson()$variance(lambda)  # mu
v_G <- Gamma()$variance(theta)     # mu^2
par(mfrow = c(1, 3), las = 2)
plot(v_b ~ p); plot(v_p ~ lambda); plot(v_G ~ theta)
Usual suspects:
## Simulate two Alien populations with different Poisson regressions, then
## (wrongly) fit a single model that ignores population -> overdispersion.
set.seed(1L)
popA <- data.frame(humans_eaten = runif(50, min = 0, max = 15))
popA$eggs <- rpois(n = 50, lambda = exp(-1 + 0.05 * popA$humans_eaten))
popA$pop <- "A"
popB <- data.frame(humans_eaten = runif(50, min = 0, max = 15))
popB$eggs <- rpois(n = 50, lambda = exp(-3 + 0.4 * popB$humans_eaten))
popB$pop <- "B"
AliensMix <- rbind(popA, popB)
(mod_poissMix <- glm(eggs ~ humans_eaten, family = poisson(), data = AliensMix))
## ## Call: glm(formula = eggs ~ humans_eaten, family = poisson(), data = AliensMix) ## ## Coefficients: ## (Intercept) humans_eaten ## -3.3414 0.3756 ## ## Degrees of Freedom: 99 Total (i.e. Null); 98 Residual ## Null Deviance: 465.7 ## Residual Deviance: 214.2 AIC: 344.9
A widely used not so good way:
## Deviance-based dispersion estimate and chi^2 p-value (crude):
cbind(disp = mod_poissMix$deviance / mod_poissMix$df.residual,
pv = pchisq(mod_poissMix$deviance, mod_poissMix$df.residual, lower.tail = FALSE))
## disp pv ## [1,] 2.185731 1.189718e-10
A slightly better way:
## Pearson-based dispersion estimate (generally preferred over the deviance one):
cbind(disp = sum(residuals(mod_poissMix, type = "pearson")^2) / mod_poissMix$df.residual,
pv = pchisq(sum(residuals(mod_poissMix, type = "pearson")^2), mod_poissMix$df.residual, lower.tail = FALSE))
## disp pv ## [1,] 2.267855 1.215556e-11
A better way (?)
## Simulation-based dispersion test (DHARMa, parametric bootstrap):
r <- simulateResiduals(mod_poissMix, refit = TRUE, n = 1000)
testOverdispersion(r) # stat = squared pearson residuals compared to that obtained by simulation
## ## DHARMa nonparametric overdispersion test via comparison to simulation under H0 = fitted model ## ## data: r ## dispersion = 2.2852, p-value = 0.001 ## alternative hypothesis: overdispersion
## Adding the population effect removes the overdispersion:
mod_poissMix2 <- glm(eggs ~ pop*humans_eaten, family = poisson(), data = AliensMix)
r2 <- simulateResiduals(mod_poissMix2, refit = TRUE, n = 1000)
testOverdispersion(r2) ## you can change options to test for underdispersion
## ## DHARMa nonparametric overdispersion test via comparison to simulation under H0 = fitted model ## ## data: r2 ## dispersion = 0.74896, p-value = 0.954 ## alternative hypothesis: overdispersion
Variance = \(k \times \mu\)
## Quasi-Poisson: same point estimates, SEs inflated by sqrt(dispersion):
mod_poissMixQ <- glm(eggs ~ humans_eaten, family = quasipoisson(), data = AliensMix)
summary(mod_poissMixQ)$coef
## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -3.341449 0.54852447 -6.091705 2.195748e-08 ## humans_eaten 0.375603 0.04364544 8.605779 1.272120e-13
## car::Anova with an F test (not LR) since the dispersion is estimated:
Anova(mod_poissMixQ, test = "F") ## 'F' not 'LR' because dispersion is estimated!
## Analysis of Deviance Table (Type II tests) ## ## Response: eggs ## Error estimate based on Pearson residuals ## ## Sum Sq Df F value Pr(>F) ## humans_eaten 251.54 1 110.92 < 2.2e-16 *** ## Residuals 222.25 98 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Variance = \(k \times \mu\)
It works, but we cannot do everything with these types of fit:
## Profile confidence intervals still work for quasi fits...
confint(mod_poissMixQ)
## Waiting for profiling to be done...
## 2.5 % 97.5 % ## (Intercept) -4.4891789 -2.334139 ## humans_eaten 0.2937638 0.465225
## ...but there is no likelihood, hence no logLik or AIC:
c(logLik(mod_poissMixQ), AIC(mod_poissMixQ))
## [1] NA NA
## (For binomial data: quasibinomial(), with the same limitation — no likelihood.)
## Reference Poisson fit and its Pearson-based dispersion estimate:
mod_poissMix <- glm(eggs ~ humans_eaten, family = poisson(), data = AliensMix)
sum(residuals(mod_poissMix, type = "pearson")^2) / mod_poissMix$df.residual
## [1] 2.267855
## Poisson coefficient table (SEs are too small under overdispersion):
summary(mod_poissMix)$coefficients
## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -3.341449 0.36421196 -9.174463 4.538303e-20 ## humans_eaten 0.375603 0.02897991 12.960806 2.040928e-38
## Negative binomial GLM (MASS); the extra shape parameter absorbs the
## overdispersion.
library(MASS)
mod_poissMixNB <- glm.nb(eggs ~ humans_eaten, data = AliensMix)
## FIX: the dispersion of the NB fit must use the NB model's own df.residual.
## The original divided by mod_poissMix$df.residual; both equal 98 here, so the
## printed value is unchanged, but mixing models is a latent bug.
sum(residuals(mod_poissMixNB, type = "pearson")^2) / mod_poissMixNB$df.residual
## [1] 0.836146
## NB coefficient table: SEs are now honest (larger than the Poisson ones):
summary(mod_poissMixNB)$coefficients
## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -2.8414180 0.47988075 -5.921092 3.198110e-09 ## humans_eaten 0.3292865 0.04508882 7.303063 2.812901e-13
## NB has a true likelihood, so AIC comparison is possible (NB wins here):
c(AIC(mod_poissMix), AIC(mod_poissMixNB), logLik(mod_poissMixNB))
## [1] 344.8726 280.7114 -137.3557
## spaMM equivalents of the Poisson and negative binomial fits:
mod_poissMixSpaMM <- fitme(eggs ~ humans_eaten, family = poisson(), data = AliensMix)
mod_poissMixNBSpaMM <- fitme(eggs ~ humans_eaten,
                             family = spaMM::negbin(), data = AliensMix) ## namespace needed!
summary(mod_poissMixNBSpaMM)
## formula: eggs ~ humans_eaten ## ML: Estimation of corrPars and NB_shape by ML. ## Estimation of fixed effects by ML. ## Estimation of NB_shape by 'outer' ML, maximizing p_v. ## Family: Neg.binomial(shape=1.017) ( link = log ) ## ------------ Fixed effects (beta) ------------ ## Estimate Cond. SE t-value ## (Intercept) -2.8414 0.47988 -5.921 ## humans_eaten 0.3293 0.04509 7.303 ## ------------- Likelihood values ------------- ## logLik ## p(h) (Likelihood): -137.3557
## Same AIC / logLik comparison for the spaMM fits (matches the glm/glm.nb ones):
c(AIC(mod_poissMixSpaMM), AIC(mod_poissMixNBSpaMM), logLik(mod_poissMixNBSpaMM))
## marginal AIC: marginal AIC: p_v ## 344.8726 280.7114 -137.3557
Replicating the Poisson fit (with spaMM):
## COM-Poisson with nu = 1 should replicate the Poisson likelihood exactly:
mod_poissMixCP <- glm(eggs ~ humans_eaten, family = COMPoisson(nu = 1), data = AliensMix)
mod_poissMixCPSpaMM <- fitme(eggs ~ humans_eaten, family = COMPoisson(nu = 1), data = AliensMix)
rbind(logLik(mod_poissMix)[[1]],
      logLik(mod_poissMixCP)[[1]],
      logLik(mod_poissMixCPSpaMM)[[1]])
## [,1] ## [1,] -170.4363 ## [2,] -189.3883 ## [3,] -190.3062
IT SHOULD ALL BE IDENTICAL (AND IT USED TO BE), BUT THERE SEEMS TO BE A BUG!!
THE PATCH IS IN PROGRESS!
## Estimate nu instead of fixing it (can take much longer to fit):
mod_poissMixCPSpaMM <- fitme(eggs ~ humans_eaten, family = COMPoisson(), data = AliensMix)
summary(mod_poissMixCPSpaMM)
## formula: eggs ~ humans_eaten ## ML: Estimation of corrPars and COMP_nu by ML. ## Estimation of fixed effects by ML. ## Estimation of COMP_nu by 'outer' ML, maximizing p_v. ## Family: COMPoisson(nu=0.05007) ( link = loglambda ) ## ------------ Fixed effects (beta) ------------ ## Estimate Cond. SE t-value ## (Intercept) -2.1584 0.24533 -8.798 ## humans_eaten 0.1502 0.01743 8.613 ## ------------- Likelihood values ------------- ## logLik ## p(h) (Likelihood): -157.0442
## AIC comparison across Poisson, NB, and both COM-Poisson fits:
c(AIC(mod_poissMixSpaMM), AIC(mod_poissMixNBSpaMM), AIC(mod_poissMixCP), AIC(mod_poissMixCPSpaMM))
## marginal AIC: marginal AIC: marginal AIC: ## 344.8726 280.7114 382.7765 318.0884
Note: here we estimate nu, so it can take (much) longer time to fit!
## Compare point predictions and prediction-variance intervals of the three
## spaMM fits on a fine grid of the predictor.
d <- data.frame(humans_eaten = seq(0, 15, length = 1000))
p <- predict(mod_poissMixSpaMM, newdata = d, intervals = "predVar")
plot(p ~ d$humans_eaten, ylim = range(mod_poissMixSpaMM$data$eggs), type = "l", lwd = 3)
## dashed lines = lower / upper interval bounds (stored in the "interval" attribute)
points(attr(p, "interval")[, 1] ~ d$humans_eaten, lty = 2, type = "l")
points(attr(p, "interval")[, 2] ~ d$humans_eaten, lty = 2, type = "l")
p <- predict(mod_poissMixNBSpaMM, newdata = d, intervals = "predVar")
points(p ~ d$humans_eaten, type = "l", col = "red", lwd = 3)
points(attr(p, "interval")[, 1] ~ d$humans_eaten, lty = 2, type = "l", col = "red")
points(attr(p, "interval")[, 2] ~ d$humans_eaten, lty = 2, type = "l", col = "red")
p <- predict(mod_poissMixCPSpaMM, newdata = d, intervals = "predVar")
points(p ~ d$humans_eaten, type = "l", col = "blue", lwd = 3)
points(attr(p, "interval")[, 1] ~ d$humans_eaten, lty = 2, type = "l", col = "blue") ## BUG
points(attr(p, "interval")[, 2] ~ d$humans_eaten, lty = 2, type = "l", col = "blue") ## BUG
It occurs for binomial or Poisson events when too many zeros occur.
It can occur when the response results from a 2-step (or more) process
Examples:
You may try again to use the negative binomial or the COM-Poisson distribution, but an alternative that is often best is to combine two models (during the fitting procedure, not sequentially)!
Fit an hurdle model
Fit a zero-inflation model
## Build a zero-inflated response: unhappy Aliens lose their eggs :-(
set.seed(1L)
AliensZ <- simulate_Aliens_GLM(N = 1000)
AliensZ$eggs <- AliensZ$eggs * AliensZ$happy
barplot(table(AliensZ$eggs))
## Fit a plain Poisson model and test for zero inflation with DHARMa:
mod_Zpoiss <- glm(eggs ~ humans_eaten, data = AliensZ, family = poisson())
r <- simulateResiduals(mod_Zpoiss, refit = TRUE, n = 1000)
testZeroInflation(r, plot = FALSE)
## ## DHARMa zero-inflation test via comparison to expected zeros with simulation under H0 = fitted model ## ## data: r ## ratioObsExp = 1.0712, p-value < 2.2e-16 ## alternative hypothesis: more
## Observed / expected number of zeros under the fitted Poisson model.
## FIX: dropped a redundant mean() that wrapped this length-1 ratio.
sum(AliensZ$eggs == 0) / sum(dpois(0, fitted(mod_Zpoiss)))
## [1] 1.072326
## Same test, with the plot of the simulated distribution of zero counts:
testZeroInflation(r, plot = TRUE)
## ## DHARMa zero-inflation test via comparison to expected zeros with simulation under H0 = fitted model ## ## data: r ## ratioObsExp = 1.0712, p-value < 2.2e-16 ## alternative hypothesis: more
## The negative binomial absorbs (most of) the excess zeros:
mod_Znb <- glm.nb(eggs ~ humans_eaten, data = AliensZ)
r <- simulateResiduals(mod_Znb, refit = TRUE, n = 1000)
testZeroInflation(r, plot = FALSE)
## ## DHARMa zero-inflation test via comparison to expected zeros with simulation under H0 = fitted model ## ## data: r ## ratioObsExp = 1.0112, p-value = 0.219 ## alternative hypothesis: more
## Plot version of the NB zero-inflation test:
testZeroInflation(r, plot = TRUE)
## ## DHARMa zero-inflation test via comparison to expected zeros with simulation under H0 = fitted model ## ## data: r ## ratioObsExp = 1.0112, p-value = 0.219 ## alternative hypothesis: more
## Hurdle models (pscl): binomial zero part + truncated count part.
library(pscl)
## Zero part depends on humans_eaten vs intercept-only zero part:
mod_Zhurd1 <- hurdle(eggs ~ humans_eaten | humans_eaten, dist = "poisson", zero.dist = "binomial",
data = AliensZ)
mod_Zhurd2 <- hurdle(eggs ~ humans_eaten | 1, dist = "poisson", zero.dist = "binomial", data = AliensZ)
## LRT between the two nested hurdle models:
lmtest::lrtest(mod_Zhurd1, mod_Zhurd2)
## Likelihood ratio test ## ## Model 1: eggs ~ humans_eaten | humans_eaten ## Model 2: eggs ~ humans_eaten | 1 ## #Df LogLik Df Chisq Pr(>Chisq) ## 1 4 -703.87 ## 2 3 -818.64 -1 229.53 < 2.2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Observed / expected zeros under the hurdle model (should be ~1 by design).
## FIX: dropped a redundant mean() that wrapped this length-1 ratio.
sum(AliensZ$eggs == 0) / sum(predict(mod_Zhurd1, type = "prob")[, 1])
## [1] 1
## Zero-inflated Poisson models (pscl) and LRT between nested zero parts:
mod_Zzi1 <- zeroinfl(eggs ~ humans_eaten | humans_eaten, dist = "poisson", data = AliensZ)
mod_Zzi2 <- zeroinfl(eggs ~ humans_eaten | 1, dist = "poisson", data = AliensZ)
lmtest::lrtest(mod_Zzi1, mod_Zzi2)
## Likelihood ratio test ## ## Model 1: eggs ~ humans_eaten | humans_eaten ## Model 2: eggs ~ humans_eaten | 1 ## #Df LogLik Df Chisq Pr(>Chisq) ## 1 4 -703.46 ## 2 3 -717.75 -1 28.586 8.961e-08 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Observed / expected zeros under the zero-inflated model.
## FIX: dropped a redundant mean() that wrapped this length-1 ratio.
sum(AliensZ$eggs == 0) / sum(predict(mod_Zzi1, type = "prob")[, 1])
## [1] 1.00019
## AIC comparison of the four competing models for the zero-inflated data:
cbind(Poisson = AIC(mod_Zpoiss),
NegBin = AIC(mod_Znb),
Hurdle = AIC(mod_Zhurd1),
ZeroInf = AIC(mod_Zzi1))
## Poisson NegBin Hurdle ZeroInf ## [1,] 1495.434 1446.461 1415.743 1414.922
## Collect the estimated coefficients of all models next to the simulation truth
## (stored as attributes of AliensZ by simulate_Aliens_GLM).
tab <- rbind(Poisson = c(mod_Zpoiss$coefficients, NA, NA),
NegBin = c(mod_Znb$coefficients, NA, NA),
Hurdle = unlist(mod_Zhurd1$coefficients), ## NOTE: the hurdle part predicts positive counts, not zeros!!
ZeroInfl = unlist(mod_Zzi1$coefficients), ## NOTE: the binary part predicts zeros, not counts!!
Truth = c(attr(AliensZ, "param.eta")$eggs, attr(AliensZ, "param.eta")$happy))
colnames(tab) <- c("Int.count", "Slope.count", "Int.bin", "Slope.bin")
tab
## Int.count Slope.count Int.bin Slope.bin ## Poisson -3.3117027 0.2520928 NA NA ## NegBin -3.4503702 0.2659359 NA NA ## Hurdle -1.0563084 0.1094210 -3.876696 0.3068416 ## ZeroInfl -0.9865184 0.1033290 2.916296 -0.2869336 ## Truth -1.0000000 0.1000000 -3.000000 0.3000000
Separation occurs when a level (or combination of levels) of a categorical predictor, or a particular threshold along a continuous predictor, predicts the outcomes perfectly.
From Wikipedia:
For example, if the predictor X is continuous, and the outcome y = 1 for all observed x > 2. If the outcome values are perfectly determined by the predictor (e.g., y = 0 when x ≤ 2) then the condition "complete separation" is said to occur. If instead there is some overlap (e.g., y = 0 when x < 2, but y has observed values of 0 and 1 when x = 2) then "quasi-complete separation" occurs. A 2 × 2 table with an empty cell is an example of quasi-complete separation.
## Quasi-complete separation example: species sp1 is never happy (prob = 0).
set.seed(1L)
n <- 50
test <- data.frame(happy = rbinom(2 * n, prob = rep(c(0, 0.75), each = n), size = 1),
                   sp = factor(rep(c("sp1", "sp2"), each = n)))
table(test$happy, test$sp)
## ## sp1 sp2 ## 0 50 13 ## 1 0 37
## Naive fit under separation: the intercept diverges towards -Inf (odds ~ 0):
mod <- glm(happy ~ sp, data = test, family = binomial())
exp(coef(mod))
## (Intercept) spsp2 ## 3.181005e-09 8.947340e+08
## safeBinaryRegression masks glm() and errors out when it detects separation:
library(safeBinaryRegression) ## overload the glm function
mod2 <- glm(happy ~ sp, data = test, family = binomial())
## Error in glm(happy ~ sp, data = test, family = binomial()): The following terms are causing separation among the sample points: (Intercept), spsp2
## Restore the standard glm() by detaching the masking package:
detach(package:safeBinaryRegression)
## spaMM can also detect separation (via e1071), but errors here:
library(spaMM) ## with package e1071 installed! (but not working at the moment...)
mod2 <- fitme(happy ~ sp, data = test, family = binomial())
## Fits using Laplace approximation may diverge for all-or-none binomial data: ## check PQL or PQL/L methods in that case.
## Error in X.pv %*% beta_eta: requires numeric/complex matrix/vector arguments
## Flipping a single observation is enough to remove the separation:
test$happy[1] <- 1
table(test$happy, test$sp)
## ## sp1 sp2 ## 0 49 13 ## 1 1 37
## With the flipped observation, the estimates are finite again:
mod2 <- glm(happy ~ sp, data = test, family = binomial())
exp(coef(mod2))
## (Intercept) spsp2 ## 0.02040816 139.46153738
## Bias-reduced (Firth-type) estimation keeps estimates finite under separation:
test$happy[1] <- 0 ## restore the original data
library(brglm2)
mod3 <- glm(happy ~ sp, data = test, family = binomial(), method = "brglmFit")
exp(coef(mod3))
## (Intercept) spsp2 ## 0.00990099 280.55555556